{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# ChatGPT Hate Speech Klassifikation I"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "#pip install openai\n",
    "#!conda install --yes -c anaconda pandas\n",
    "#!conda install --yes -c anaconda scikit-learn"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import re\n",
    "import time\n",
    "\n",
    "import openai\n",
    "import pandas as pd\n",
    "from openai import OpenAI\n",
    "from sklearn.metrics import classification_report\n",
    "from sklearn.metrics import confusion_matrix"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# A notebook kernel does not define __file__, so the working directory is\n",
    "# used as the notebook location.\n",
    "notebook_dir = os.getcwd()\n",
    "notebook_path = notebook_dir  # name kept so anything referring to it keeps working\n",
    "print(notebook_dir)\n",
    "\n",
    "# Root folder of the project data. Set HATESPEECH_PROJECT_DIR to run the\n",
    "# notebook on another machine; the default is the original author's location.\n",
    "PROJECT_DIR = os.environ.get(\n",
    "    'HATESPEECH_PROJECT_DIR',\n",
    "    '/Users/Mael/Library/CloudStorage/Dropbox/projects_active/eth_uzh_hatespeech_llm_finetuning',\n",
    ")\n",
    "\n",
    "# The trailing '' keeps a path separator at the end of each folder, because\n",
    "# the rest of the notebook builds file names with plain string concatenation.\n",
    "path = os.path.join(PROJECT_DIR, 'data', '')\n",
    "path2 = os.path.join(PROJECT_DIR, 'annotations', 'chatGPT', 'input_zero_shot', '')\n",
    "path3 = os.path.join(PROJECT_DIR, 'instructions_gpt_4o', '')\n",
    "path4 = os.path.join(PROJECT_DIR, 'annotations', 'chatGPT', 'output_zero_shot', '')\n",
    "\n",
    "df = pd.read_csv(path + 'citizen_science_for_annots_27012022.csv')\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(500, 5)"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#df['check_hs']=df['check_hs'].astype(int)\n",
    "\n",
    "# Comments to classify, as a plain Python list in DataFrame row order.\n",
    "my_text = df['Kommentar'].to_list()\n",
    "\n",
    "# Keep a copy of the input frame in the zero-shot input folder.\n",
    "df.to_csv(f'{path2}chatGPT_data_readtouse.csv')\n",
    "\n",
    "df.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Read the prompt file; `instruction` is the list of its lines, newlines included.\n",
    "with open(path3+'/Instructions_un_definition.txt') as prompt_file:\n",
    "    instruction = list(prompt_file)\n",
    "instruction"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Placeholders for OpenAI API keys. Leave them empty in shared copies of the\n",
    "# notebook and provide the key through the OPENAI_API_KEY environment variable.\n",
    "my_key = '' # Your OpenAI API Key\n",
    "my_key_2 = '' # Your OpenAI API Key\n",
    "my_key_3 = '' # Your OpenAI API Key\n",
    "\n",
    "client = OpenAI(\n",
    "    # An empty placeholder is passed as None so the client falls back to\n",
    "    # OPENAI_API_KEY instead of sending an empty key with every request.\n",
    "    api_key=my_key_2 or None,\n",
    ")\n",
    "\n",
    "def run_chatgpt_default(instruction, my_text, k, retry_limit=5, wait_time=60):\n",
    "    \"\"\"Label every comment with the chat model and save the answers as CSV.\n",
    "\n",
    "    Each comment is sent in its own request together with the system prompt.\n",
    "    Requests that hit the rate limit are retried after a pause.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    instruction : object\n",
    "        System prompt; it is converted with str() before being sent.\n",
    "    my_text : list of str\n",
    "        Comments to classify.\n",
    "    k : int or str\n",
    "        Run identifier used in the output file name.\n",
    "    retry_limit : int, default 5\n",
    "        Number of retries allowed per comment after a rate-limit error.\n",
    "    wait_time : int or float, default 60\n",
    "        Seconds to sleep before each retry.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    pandas.DataFrame\n",
    "        Columns 'Kommentar' (input text) and 'label' (raw model answer).\n",
    "\n",
    "    Raises\n",
    "    ------\n",
    "    openai.RateLimitError\n",
    "        When a comment still hits the rate limit after retry_limit retries.\n",
    "    \"\"\"\n",
    "    # NOTE(review): the notebook passes the list returned by readlines(), so\n",
    "    # str() yields the list's repr (brackets, quotes, escaped newlines) rather\n",
    "    # than the plain text. Kept unchanged so prompts stay comparable with\n",
    "    # earlier runs - confirm this is intended.\n",
    "    content = str(instruction)\n",
    "    user_prefix = \"Here's the comment I picked, please label it as instructed as hate speech, or not and if there is hate speech also return the target group: \"\n",
    "\n",
    "    label = []\n",
    "    text_list = []\n",
    "\n",
    "    # enumerate gives the true position even when the same comment occurs\n",
    "    # twice, and avoids a linear search per item.\n",
    "    for i, Kommentar in enumerate(my_text):\n",
    "        print(i)\n",
    "\n",
    "        retry_count = 0\n",
    "        while True:\n",
    "            try:\n",
    "                completion = client.chat.completions.create(\n",
    "                    model=\"gpt-4o-mini-2024-07-18\",\n",
    "                    messages=[\n",
    "                        {\"role\": \"system\", \"content\": content},\n",
    "                        {\"role\": \"user\", \"content\": user_prefix + Kommentar},\n",
    "                    ],\n",
    "                )\n",
    "                break\n",
    "            except openai.RateLimitError:\n",
    "                retry_count += 1\n",
    "                if retry_count > retry_limit:\n",
    "                    # Bare raise keeps the original traceback.\n",
    "                    raise\n",
    "                print(\n",
    "                    f\"Rate limit exceeded. Retrying in {wait_time} seconds... (Retry count: {retry_count})\"\n",
    "                )\n",
    "                time.sleep(wait_time)\n",
    "\n",
    "        answer = completion.choices[0].message.content\n",
    "        label.append(answer)\n",
    "        text_list.append(Kommentar)\n",
    "        print(answer)\n",
    "\n",
    "    df_label = pd.DataFrame({\"Kommentar\": text_list, \"label\": label})\n",
    "    df_label.to_csv(path4 + f\"label_hatespeech_round_un_definition_{k}.csv\")\n",
    "\n",
    "    return df_label\n",
    "\n",
    "\n",
    "def evaluate(df_label):\n",
    "    \"\"\"Compare model labels with the reference annotations and print a report.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    df_label : pandas.DataFrame\n",
    "        Model output with a 'label' column, e.g. from run_chatgpt_default.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    pandas.DataFrame\n",
    "        Reference data and model output side by side, without the rows that\n",
    "        have no label, and with 'label' replaced by the result of find_label.\n",
    "    \"\"\"\n",
    "    # NOTE(review): find_label is not defined in the visible part of this\n",
    "    # notebook, and the 'intersection' column is never created here; both\n",
    "    # presumably come from elsewhere - confirm before running.\n",
    "    df = pd.read_csv(path + 'hatespeech_deciles_readytouse.csv')\n",
    "\n",
    "    print(df.shape)\n",
    "\n",
    "    # axis has to be given by keyword: pandas 2.x rejects positional arguments\n",
    "    # after the list of frames.\n",
    "    df_final1 = pd.concat([df, df_label], axis=1)\n",
    "\n",
    "    # copy() makes the filtered frame independent, so the assignment below\n",
    "    # does not trigger SettingWithCopyWarning.\n",
    "    df_final1 = df_final1.dropna(subset=['label']).copy()\n",
    "    df_final1['label'] = df_final1['label'].apply(lambda x: find_label(x))\n",
    "\n",
    "    print('****************************\\n ChatGPT labelling value count:')\n",
    "    print('label: \\n', df_final1['label'].value_counts())\n",
    "\n",
    "    print('****************************\\n intersection with RA labelling:')\n",
    "    print(df_final1['intersection'].value_counts(normalize=True))\n",
    "\n",
    "    y_test = df_final1['check_hs']\n",
    "    prediction = df_final1['label']\n",
    "\n",
    "    print(confusion_matrix(y_test, prediction))\n",
    "    print(classification_report(y_test, prediction))\n",
    "\n",
    "    return df_final1\n",
    "\n",
    "\n",
    "\n",
    "def compare(df1, df2, l1, l2):\n",
    "    \"\"\"Return the relative frequencies of find_intersections over two label columns.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    df1, df2 : pandas.DataFrame\n",
    "        Frames placed side by side (column-wise) before the comparison.\n",
    "    l1, l2 : str\n",
    "        Names of the two columns handed to find_intersections for each row.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    pandas.Series\n",
    "        Normalised value counts of the per-row find_intersections result.\n",
    "    \"\"\"\n",
    "    # NOTE(review): find_intersections is not defined in the visible part of\n",
    "    # this notebook - confirm it is available before calling.\n",
    "    # axis has to be given by keyword: pandas 2.x rejects positional arguments\n",
    "    # after the list of frames.\n",
    "    df_compare = pd.concat([df1, df2], axis=1)\n",
    "\n",
    "    df_compare['label_intersection'] = df_compare.apply(\n",
    "        lambda row: find_intersections(row[l1], row[l2]), axis=1\n",
    "    )\n",
    "\n",
    "    return df_compare['label_intersection'].value_counts(normalize=True)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Labelling run 1; run_chatgpt_default also writes the result to a CSV file.\n",
    "print('\\n Task 1: Round 3 with default temperature:')\n",
    "df_label1 = run_chatgpt_default(instruction, my_text, k=1)\n",
    "#df_final1=evaluate(df_label1)\n",
    "#df_final1=df_final1.rename(columns={'label':'label1'})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Labelling run 2; note that df_label1 is reused and replaces the previous run in memory.\n",
    "print('\\n Task 2: Round 3 with default temperature:')\n",
    "df_label1 = run_chatgpt_default(instruction, my_text, k=2)\n",
    "#df_final1=evaluate(df_label1)\n",
    "#df_final1=df_final1.rename(columns={'label':'label1'})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Labelling run 3; note that df_label1 is reused and replaces the previous run in memory.\n",
    "print('\\n Task 3: Round 3 with default temperature:')\n",
    "df_label1 = run_chatgpt_default(instruction, my_text, k=3)\n",
    "#df_final1=evaluate(df_label1)\n",
    "#df_final1=df_final1.rename(columns={'label':'label1'})"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "chatGPT",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
